In [1]:
import os
from pandas import DataFrame
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import preprocessing
from sklearn.decomposition import PCA

from scipy.stats import zscore
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
In [2]:
def preparation(data_path, target_path):
    # load data
    df_raw = pd.read_csv(data_path, index_col=False)
    df_raw = df_raw.set_index(['GO term', 'category', 'description'])
    df_raw_target = pd.read_csv(target_path)
    
    # Sum feature
    df_raw['total'] = df_raw.sum(axis=1) / (len(df_raw.index))

    # Sort feature by sum of numbers
    df_raw = df_raw.sort_values('total', ascending=True)
    df_raw = df_raw.drop(columns=['total'])
    return df_raw, df_raw_target

    
def normalizing(df_raw, df_raw_target):
    # normalizing by MinMax Scaler
    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_array = min_max_scaler.fit_transform(df_raw)
    df_norm_minmax = pd.DataFrame(scaled_array, columns=df_raw_target.Sample, index=df_raw.index)

    # normalizing by zscore
    df_norm_zscore = df_raw.apply(zscore)

    data_list = [df_raw, df_norm_minmax, df_norm_zscore]

    # Prepare data
    data_raw = df_raw.T
    data_minmax = df_norm_minmax.T
    data_zscore = df_norm_zscore.T
    

    #target
    y = df_raw_target

    #feature
    x1 = data_raw.values
    x2 = data_minmax.values
    x3 = data_zscore.values

    # Input
    feature = [x1, x2, x3] 
    return data_list, feature, y

def calculate_braycurtis(data):
    df = squareform(pdist(data.T, metric='braycurtis'))
    return df

def calculate_PCA(data, y):
    pca = PCA(n_components=3)
    principalComponents = pca.fit_transform(data)
    principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2', 'principal component 3'])
    finalDf = pd.concat([principalDf, pd.DataFrame(y)], axis=1)
    return finalDf

def visualize_PCA_2D(finalDf):
    fig = plt.figure(figsize = (8,8))
    ax = fig.add_subplot(1,1,1) 
    ax.set_xlabel('Principal Component 1', fontsize = 15)
    ax.set_ylabel('Principal Component 2', fontsize = 15)
    ax.set_title('2 component PCA', fontsize = 20)
    targets = finalDf.Study.unique()
    colors = ['b', 'w', 'r', 'c', 'm', 'y', 'k', 'g']
    for target, color in zip(targets,colors):
        indicesToKeep = finalDf['Study'] == target
        ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50
               , alpha=0.5)
    ax.legend(targets)
    ax.grid()
    return plt.show()

def plot_heatmap(df, title):
    plt.figure(figsize = (10,18))
    sns.set(font_scale=0.7) 
    plt.title(title, fontsize =20)
    sns.heatmap(df, 
                yticklabels=df.index.get_level_values(2),
                cbar_kws={"orientation": "horizontal"})
In [3]:
data_path = 'output/df_raw.csv'
target_path = 'output/df_raw_target.csv'

df_raw, df_raw_target = preparation(data_path, target_path)
data_list, feature, y = normalizing(df_raw, df_raw_target)
In [4]:
data_list[0].T.describe()
Out[4]:
GO term GO:0030031 GO:0016049 GO:0045182 GO:0005773 GO:0070469 GO:0031012 GO:0009295 GO:0002376 GO:0005667 GO:0009404 ... GO:0005215 GO:0016020 GO:0006810 GO:0016740 GO:0003676 GO:0016491 GO:0006807 GO:0009058 GO:0044281 GO:0000166
category biological_process biological_process molecular_function cellular_component cellular_component cellular_component cellular_component biological_process cellular_component biological_process ... molecular_function cellular_component biological_process molecular_function molecular_function molecular_function biological_process biological_process biological_process molecular_function
description cell projection assembly cell growth translation regulator activity vacuole respiratory chain extracellular matrix nucleoid immune system process transcription factor complex toxin metabolic process ... transporter activity membrane transport transferase activity nucleic acid binding oxidoreductase activity nitrogen compound metabolic process biosynthetic process small molecule metabolic process nucleotide binding
count 692.000000 692.000000 692.000000 692.000000 692.000000 692.000000 692.000000 692.000000 692.000000 692.000000 ... 692.000000 692.000000 6.920000e+02 6.920000e+02 6.920000e+02 6.920000e+02 6.920000e+02 6.920000e+02 6.920000e+02 6.920000e+02
mean 0.504335 0.722543 0.923410 3.708092 5.463873 8.911850 10.697977 11.124277 11.781792 11.839595 ... 66196.901734 69129.514451 9.006258e+04 1.077323e+05 1.244876e+05 1.249676e+05 1.374904e+05 1.440681e+05 1.454440e+05 1.822304e+05
std 2.883906 9.813067 12.162012 13.236195 22.861063 72.890235 55.918121 114.436538 39.503577 88.430479 ... 113680.125632 101213.863731 1.524008e+05 1.781995e+05 2.103199e+05 2.068027e+05 2.069738e+05 2.256319e+05 2.542848e+05 3.095884e+05
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000e+00 0.000000e+00 8.000000e+00 0.000000e+00 0.000000e+00 2.700000e+01 0.000000e+00 0.000000e+00
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 5665.250000 8709.750000 8.387500e+03 1.107625e+04 1.199400e+04 1.072825e+04 1.593250e+04 1.588250e+04 1.237500e+04 1.565350e+04
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 21248.500000 27126.500000 2.945050e+04 3.613150e+04 4.022500e+04 3.896800e+04 5.210400e+04 5.213250e+04 4.168500e+04 5.626950e+04
75% 0.000000 0.000000 0.000000 1.000000 1.000000 1.000000 2.000000 0.000000 4.000000 4.000000 ... 65433.500000 84363.750000 9.245525e+04 1.084270e+05 1.182782e+05 1.298508e+05 1.577045e+05 1.559428e+05 1.393350e+05 1.766330e+05
max 39.000000 201.000000 279.000000 166.000000 366.000000 1455.000000 1136.000000 1850.000000 496.000000 2111.000000 ... 813072.000000 733154.000000 1.183118e+06 1.199649e+06 1.490334e+06 1.440774e+06 1.365008e+06 1.467779e+06 1.752877e+06 2.030833e+06

8 rows × 116 columns

In [5]:
for count, i in enumerate(data_list):
    label = ['raw', 'maxmin', 'zscore']
    select = i.T#.describe()
    #select.loc["count"]
    #select.iloc[:,0].plot.hist()
    select.plot.hist(bins=50, alpha=0.5, legend=False, title=label[count])
In [6]:
dataframe = [[data_list[0], "Not Processed"], 
             [data_list[1], "MinMax Scaled"],
             [data_list[2], "ZScore Scaled"],
            ]

for i in dataframe:
    plot_heatmap(i[0], i[1])
In [7]:
for i in feature:
    finalDf = calculate_PCA(i, y)
    visualize_PCA_2D(finalDf)
In [8]:
df_braycurtis = []

for i in data_list:
    x = calculate_braycurtis(i)
    df_braycurtis.append(x)
In [9]:
for i in df_braycurtis:
    plt.figure(figsize = (5,5))
    sns.set(font_scale=0.7) 
    #plt.title("title", fontsize =20)

    # Generate a mask for the upper triangle
    mask = np.zeros_like(i, dtype=np.bool)
    mask[np.triu_indices_from(mask)] = True

    sns.heatmap(i, 
                mask=mask,
                #yticklabels=df.index.get_level_values(2),
                cbar_kws={"orientation": "horizontal"})
In [10]:
finalDf = calculate_PCA(feature[2], y)

# Set style of scatterplot
sns.set_context("notebook", font_scale=1.1)
sns.set_style("whitegrid")

g = sns.lmplot(x="principal component 1",
               y="principal component 2",
               data=finalDf,
               legend=True,
               palette="Set2",
               fit_reg=False,
               height=10,
               hue='Study',
               scatter_kws={"s":150, "alpha":0.7})
plt.title('PCA Results', weight='bold').set_fontsize('14')
plt.xlabel('Principal Component 1', weight='bold').set_fontsize('10')
plt.ylabel('Principal Component 2', weight='bold').set_fontsize('10')
In [11]:
from sklearn.manifold import TSNE
from time import time

perplexities = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
n_components = 2
tsne_result = []
X = feature[1]
for i, perplexity in enumerate(perplexities):
    t0 = time()
    tsne = TSNE(n_components=n_components, init='random',
                         random_state=0, perplexity=perplexity)
    Y = tsne.fit_transform(X)
    tsne_result.append(Y)
    t1 = time()
    print("z-score, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))

    tSNE_df = pd.DataFrame(data = tsne_result[i], columns = ['component_1', 'component_2'])
    tSNE_df = pd.concat([tSNE_df, pd.DataFrame(y)], axis=1)

    # Set style of scatterplot
    sns.set_context("notebook", font_scale=1.1)
    sns.set_style("whitegrid")

    g = sns.lmplot(x="component_1",
               y="component_2",
               data=tSNE_df,
               legend=True,
               palette="Set2",
               fit_reg=False,
               height=10,
               hue='Study',
               scatter_kws={"s":150, "alpha":0.7})
    plt.title('tSNE Result, perplexity= %d'  % (perplexity), weight='bold').set_fontsize('14') 
    plt.xlabel('Component 1', weight='bold').set_fontsize('10')
    plt.ylabel('Component 2', weight='bold').set_fontsize('10')
    plt.show()
    
z-score, perplexity=5 in 3.3 sec
z-score, perplexity=10 in 3.7 sec
z-score, perplexity=15 in 3.6 sec
z-score, perplexity=20 in 3.9 sec
z-score, perplexity=25 in 4.1 sec
z-score, perplexity=30 in 4.6 sec
z-score, perplexity=35 in 4.8 sec
z-score, perplexity=40 in 4.8 sec
z-score, perplexity=45 in 5 sec
z-score, perplexity=50 in 5.4 sec
In [12]:
perplexities = [2, 5, 30, 50, 100]
n_components = 2
tsne_result = []
X = df_braycurtis[2]
for i, perplexity in enumerate(perplexities):
    t0 = time()
    tsne = TSNE(n_components=n_components, init='random',
                metric="precomputed", random_state=0, perplexity=perplexity)
    Y = tsne.fit_transform(X)
    tsne_result.append(Y)
    t1 = time()
    print("z-score, perplexity=%d in %.2g sec" % (perplexity, t1 - t0))

    tSNE_df = pd.DataFrame(data = tsne_result[i], columns = ['component_1', 'component_2'])
    tSNE_df = pd.concat([tSNE_df, pd.DataFrame(y)], axis=1)

    # Set style of scatterplot
    sns.set_context("notebook", font_scale=1.1)
    sns.set_style("whitegrid")

    g = sns.lmplot(x="component_1",
               y="component_2",
               data=tSNE_df,
               legend=True,
               palette="Set2",
               fit_reg=False,
               height=10,
               hue='Study',
               scatter_kws={"s":150, "alpha":0.7})
    plt.title('tSNE Result, perplexity= %d'  % (perplexity), weight='bold').set_fontsize('14') 
    plt.xlabel('Component 1', weight='bold').set_fontsize('10')
    plt.ylabel('Component 2', weight='bold').set_fontsize('10')
    plt.show()
    
z-score, perplexity=2 in 3.8 sec
z-score, perplexity=5 in 3.3 sec
z-score, perplexity=30 in 4 sec
z-score, perplexity=50 in 6 sec
z-score, perplexity=100 in 7.2 sec
In [ ]: